For this project, the sector fund Fidelity Select Technology Portfolio (FSPTX) is chosen as the target fund. On Fidelity’s website, it is categorized as Large Growth.
For comparison, Russell 2000 (^RUT), NASDAQ (^IXIC), S&P 500 (^GSPC), S&P 500 IT sector, S&P North America Tech Sector, S&P MidCap (^MID) and S&P SmallCap (^SML) were selected as the initial indexes to be compared with.
Also for comparison, Vanguard’s similar index funds are loaded: the IT ETF (VGT), the LargeCap ETF (VIGAX) and the TotalMarket ETF (VTSAX). Among these, VGT serves as the MSCI US IM Info. Tech. 25/50 index, VIGAX as the CRSP US Large Cap Growth Index, and VTSAX as the CRSP US Total Market Index.
Before loading the data, we first define some helper functions to ease the data-cleaning process and declare some variables.
## Add daily simple and log returns to a price table, then trim it to a date
## window.
##
## Arguments:
##   Date1     - first date (inclusive) to keep.
##   DataFrame - data frame with a numeric daily close column named `Close` and
##               a date column named `Date` ("yyyy-mm-dd"). lag() works on row
##               order, so rows are assumed to be sorted by ascending Date with
##               one row per trading day -- TODO confirm for every input file.
##   EndDate   - last date (inclusive) to keep. Defaults to 2018-12-31, the
##               cut-off that was previously hard-coded in the body.
##
## Returns DataFrame with four added columns (dailyReturn, log.Close,
## log.Return, perc_dailyRe), restricted to Date1 <= Date <= EndDate.
##
## Returns are computed BEFORE trimming, so the first kept row still has a
## return whenever an earlier row exists in the input.
## NOTE(review): an older comment here mentioned trimming from 1990-10-07; the
## start of the window is now whatever the caller passes as Date1.
dailynlogReturn <- function(Date1, DataFrame,
                            EndDate = as.Date("2018-12-31")) {
  DataFrame %>%
    dplyr::mutate(
      ## Simple return is measured against the PREVIOUS close. The earlier
      ## version divided by the current close, which understates gains and
      ## overstates losses.
      ## dplyr::lag is spelled out because a bare lag() silently becomes
      ## stats::lag (a no-op shift on plain vectors) if dplyr is masked.
      dailyReturn = (Close - dplyr::lag(Close)) / dplyr::lag(Close),
      log.Close = log(Close),
      log.Return = log.Close - dplyr::lag(log.Close),
      ## Same simple return expressed in percent, rounded to 3 decimals.
      perc_dailyRe = round(dailyReturn * 100.0, 3)
    ) %>%
    dplyr::filter(Date >= Date1, Date <= EndDate)
}
## Rescale Close into the value of a 10,000 stake opened on the earliest date
## in the table: ProjValper10k = Close * 10000 / (Close on the earliest Date).
## Per the original note this stands for "10k invested at the start with all
## earnings reinvested"; the code itself only rescales the Close column.
## The result (DF plus the ProjValper10k column) is returned invisibly.
getProjectionValue <- function(DF) {
  ## Earliest date present in the Date column.
  firstDay <- DF["Date"] %>%
    top_n(-1) %>%
    pull()
  ## Close on that earliest date: the baseline price.
  P0 <- DF %>%
    filter(Date == firstDay) %>%
    select(Close) %>%
    pull()
  invisible(mutate(DF, ProjValper10k = (Close * 10000) / P0))
}
## Root-mean-square difference between two numeric vectors:
##   sqrt( sum((x - y)^2) / length(y) )
## This is the Euclidean distance divided by sqrt(length(y)), not the raw
## Euclidean distance.
##
## Arguments:
##   x, y - numeric vectors; x - y follows R's usual recycling rules.
## Returns a single number. Any NA in x or y makes the result NA.
##
## Uses base sum() so the function no longer needs an add-on package to
## supply dot().
sqerr <- function(x, y) {
  delta <- x - y
  sqrt(sum(delta * delta) / length(y))
}
## Standardize the Close column (z-score): adds Close.z, which is Close minus
## its mean, divided by its standard deviation. Returns DF with that extra
## column; all other columns are left as they are.
standardizedNAV <- function(DF) {
  DF %>%
    mutate(Close.z = (Close - mean(Close)) / sd(Close))
}
## First date of the study window: only data on or after 2014-01-01 is kept.
StartDate <- as.Date("2014-01-01")
Load the data:
## Read each price CSV and run it through dailynlogReturn(), which adds the
## return columns and keeps only rows from StartDate onwards.

## Target fund.
FSPTX <- dailynlogReturn(
  Date1 = StartDate,
  DataFrame = read_csv("FSPTX.csv")
)

## Market indexes.
NASDAQ <- dailynlogReturn(
  Date1 = StartDate,
  DataFrame = read_csv("^IXIC.csv")
)
SnP500 <- dailynlogReturn(
  Date1 = StartDate,
  DataFrame = read_csv("^GSPC.csv")
)
SnPMID <- dailynlogReturn(
  Date1 = StartDate,
  DataFrame = read_csv("^MID.csv")
)
SnPSML <- dailynlogReturn(
  Date1 = StartDate,
  DataFrame = read_csv("^SML.csv")
)
RUSSELL2000 <- dailynlogReturn(
  Date1 = StartDate,
  DataFrame = read_csv("^RUT.csv")
)

## Vanguard funds.
VGT <- dailynlogReturn(
  Date1 = StartDate,
  DataFrame = read_csv("VGT.csv")
)
VIGAX <- dailynlogReturn(
  Date1 = StartDate,
  DataFrame = read_csv("VIGAX.csv")
)
VTSAX <- dailynlogReturn(
  Date1 = StartDate,
  DataFrame = read_csv("VTSAX.csv")
)

## Technology sector indexes.
SnP500Info <- dailynlogReturn(
  Date1 = StartDate,
  DataFrame = read_csv("SnP500Info.csv")
)
SnPNATech <- dailynlogReturn(
  Date1 = StartDate,
  DataFrame = read_csv("SnPNATECH_clean.csv")
)
Check DailyReturn anomalies and set them to NA value (in the code below, the affected dates are dropped from every series).
## Compare daily returns: FSPTX side by side with each benchmark.
## NOTE(review): the series are lined up by row position, not joined on Date,
## so this presumes every table holds exactly the same trading days -- confirm.
dailyReturnComp <- cbind(
  as.Date(FSPTX$Date),
  FSPTX$dailyReturn,
  NASDAQ$dailyReturn,
  SnP500$dailyReturn,
  VGT$dailyReturn,
  VIGAX$dailyReturn,
  VTSAX$dailyReturn
)
colnames(dailyReturnComp) <- c(
  "Date", "FSPTX", "NASDAQ", "SnP500", "VGT", "VIGAX", "VTSAX"
)

## A benchmark return whose square is at or below epsilon is treated as zero:
## the ratio column then falls back to FSPTX's own return instead of dividing.
epsilon <- 1e-18

## cbind() produced a numeric matrix, so Date is turned back into a date here.
## vs*    = FSPTX return divided by the benchmark return (guarded as above).
## minus* = FSPTX return minus the benchmark return.
dailyReturnComp <- data.frame(dailyReturnComp) %>%
  mutate(
    Date = as_date(Date),
    vsNASDAQ = ifelse(NASDAQ * NASDAQ <= epsilon, FSPTX, FSPTX / NASDAQ),
    vsSnP500 = ifelse(SnP500 * SnP500 <= epsilon, FSPTX, FSPTX / SnP500),
    vsVGT = ifelse(VGT * VGT <= epsilon, FSPTX, FSPTX / VGT),
    vsVIGAX = ifelse(VIGAX * VIGAX <= epsilon, FSPTX, FSPTX / VIGAX),
    vsVTSAX = ifelse(VTSAX * VTSAX <= epsilon, FSPTX, FSPTX / VTSAX),
    minusNASDAQ = FSPTX - NASDAQ,
    minusSnP500 = FSPTX - SnP500,
    minusVGT = FSPTX - VGT,
    minusVIGAX = FSPTX - VIGAX,
    minusVTSAX = FSPTX - VTSAX
  )
## Interactive scatter of (FSPTX - NASDAQ) daily return over time, with a
## loess trend line and its confidence band.
plotly::plotly_build(
  ggplot(dailyReturnComp, aes(x = Date, y = minusNASDAQ)) +
    geom_point(alpha = 0.1) +
    geom_smooth(method = "loess", se = TRUE)
)
## Dates on which FSPTX's daily return trails NASDAQ's by more than 0.025.
## NOTE(review): the variable name suggests these are taken to be
## distribution (dividend) dates of the fund -- confirm.
DividenDates <- dailyReturnComp %>%
  dplyr::filter(minusNASDAQ < -0.025) %>%
  dplyr::pull(Date)

## Drop those dates from every series so all tables keep the same rows.
FSPTX <- dplyr::filter(FSPTX, !(Date %in% DividenDates))
NASDAQ <- dplyr::filter(NASDAQ, !(Date %in% DividenDates))
SnP500 <- dplyr::filter(SnP500, !(Date %in% DividenDates))
SnPMID <- dplyr::filter(SnPMID, !(Date %in% DividenDates))
SnPSML <- dplyr::filter(SnPSML, !(Date %in% DividenDates))
RUSSELL2000 <- dplyr::filter(RUSSELL2000, !(Date %in% DividenDates))
VGT <- dplyr::filter(VGT, !(Date %in% DividenDates))
VIGAX <- dplyr::filter(VIGAX, !(Date %in% DividenDates))
VTSAX <- dplyr::filter(VTSAX, !(Date %in% DividenDates))
SnP500Info <- dplyr::filter(SnP500Info, !(Date %in% DividenDates))
SnPNATech <- dplyr::filter(SnPNATech, !(Date %in% DividenDates))
## One column of daily log returns per series, matched by row position.
## data.frame() rewrites the three Vanguard labels into syntactic names
## (parentheses and spaces become dots).
DailyReturncor <- data.frame(
  cbind(
    FSPTX = FSPTX$log.Return,
    NASDAQ = NASDAQ$log.Return,
    RUSSELL2000 = RUSSELL2000$log.Return,
    SnP500 = SnP500$log.Return,
    SnP500Info = SnP500Info$log.Return,
    SnPNATech = SnPNATech$log.Return,
    SnPMID = SnPMID$log.Return,
    SnPSML = SnPSML$log.Return,
    `VGT(IT ETF)` = VGT$log.Return,
    `VIGAX(LargeCAP)` = VIGAX$log.Return,
    `VTSAX(TotalMarket)` = VTSAX$log.Return
  )
)
#print("DailyReturn Correlation")
#cor(DailyReturncor,DailyReturncor)
## Red -> white -> blue palette for the correlation plot.
colmat <- colorRampPalette(c("red", "white", "blue"))

## Lower-triangle plot of the pairwise correlations of daily log returns,
## with the colour legend limited to 0.6 .. 1.0.
## NOTE(review): I believe newer corrplot releases renamed `cl.lim` to
## `col.lim` -- check against the installed version.
corrplot::corrplot(
  cor(DailyReturncor, DailyReturncor),
  type = "lower",
  is.corr = FALSE,
  cl.lim = c(0.6, 1.0),
  col = colmat(100),
  title = "Daily Log Return cor",
  tl.cex = 0.8,
  mar = c(1, 1, 2, 1)
)

## Same correlation matrix as a data frame; print only its first column,
## i.e. every series' correlation with FSPTX.
logreturncor <- data.frame(cor(DailyReturncor, DailyReturncor))
dplyr::select(logreturncor, 1)
The best matches according to daily log return are SnPNATech, SnP500Info and NASDAQ. In addition, VGT.IT.ETF is also highly correlated in terms of daily log return.
## Linear regression of FSPTX's daily log return on the daily log returns of
## NASDAQ, SnP500Info, SnPNATech and VGT. The trailing "- 1" removes the
## intercept, so the fit is a pure weighted combination of the four series.
## The columns are taken straight from the separate tables, so the rows must
## line up by position across all five of them.
logreturnmodel <- lm(FSPTX$log.Return~NASDAQ$log.Return + SnP500Info$log.Return + SnPNATech$log.Return+VGT$log.Return - 1)
## Print coefficients, standard errors and fit statistics.
summary(logreturnmodel)
##
## Call:
## lm(formula = FSPTX$log.Return ~ NASDAQ$log.Return + SnP500Info$log.Return +
## SnPNATech$log.Return + VGT$log.Return - 1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.0186969 -0.0016586 -0.0000369 0.0016204 0.0115334
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## NASDAQ$log.Return 0.23270 0.03739 6.223 6.64e-10 ***
## SnP500Info$log.Return -0.61227 0.06703 -9.134 < 2e-16 ***
## SnPNATech$log.Return 0.74973 0.06231 12.032 < 2e-16 ***
## VGT$log.Return 0.65983 0.07663 8.611 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.002829 on 1247 degrees of freedom
## Multiple R-squared: 0.9378, Adjusted R-squared: 0.9376
## F-statistic: 4698 on 4 and 1247 DF, p-value: < 2.2e-16
## Standardized residuals against fitted values for the log-return model.
## The dashed horizontal lines mark +2 and -2.
plotly::plotly_build(
  ggplot(logreturnmodel, aes(x = .fitted, y = .stdresid)) +
    geom_point() +
    geom_abline(intercept = 2, slope = 0, linetype = "dashed") +
    geom_abline(intercept = -2, slope = 0, linetype = "dashed") +
    xlab("Fitted Values") +
    ylab("Standardized Residuals") +
    ggtitle("Standardized Residual Plot")
)
#plot(logreturnmodel,which = 2)

## Normal QQ plot of the same standardized residuals, with a dashed
## reference line.
plotly::plotly_build(
  ggplot(logreturnmodel, aes(sample = .stdresid)) +
    stat_qq() +
    stat_qq_line(linetype = "dashed") +
    xlab("Theoretical") +
    ylab("Standardized Residuals") +
    ggtitle("QQnorm Plot")
)
Save the predicted values as the index composite (stored below in the composit column).
## Table of observed FSPTX log returns next to the model's fitted values
## (the index "composit"), one row per date.
##
## The columns are assembled directly with data.frame(). The earlier version
## went through cbind() with a character date, which turned the whole matrix
## into text (numbers cut to 15 significant digits) before parsing it back.
## as.numeric() drops the names that lm() attaches to the fitted values.
## data.frame() stops with an error if the three columns differ in length
## instead of recycling the shorter ones.
regressiontable <- data.frame(
  Date = as.Date(FSPTX$Date),
  FSPTX = as.numeric(FSPTX$log.Return),
  composit = as.numeric(logreturnmodel$fitted.values)
)

## direction is 1 when the observed and fitted returns have the same sign
## (their product is positive) and 0 otherwise, including when either is
## exactly zero.
regressiontable <- regressiontable %>%
  mutate(direction = if_else(FSPTX * composit > 0, 1, 0))
## Scatter of the direction flag (1 = FSPTX and the composit moved the same
## way, 0 = not) against the composit's fitted log return.
ggplot(regressiontable, aes(x = composit, y = direction)) +
  geom_point()

## Logistic regression: does the size of the composit's return explain
## whether FSPTX moves in the same direction?
directionmodel <- glm(
  direction ~ composit,
  family = binomial(link = "logit"),
  data = regressiontable
)
summary(directionmodel)
##
## Call:
## glm(formula = direction ~ composit, family = binomial(link = "logit"),
## data = regressiontable)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.1763 0.4424 0.4473 0.4524 0.4878
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 2.24212 0.09603 23.35 <2e-16 ***
## composit 3.47735 8.69290 0.40 0.689
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 790.71 on 1250 degrees of freedom
## Residual deviance: 790.55 on 1249 degrees of freedom
## AIC: 794.55
##
## Number of Fisher Scoring iterations: 5
## Fitted probabilities from the direction model and the observed 0/1 flags.
fitted.y <- fitted(directionmodel)
observed.y <- regressiontable$direction

## ROC curve (true-positive rate against false-positive rate); the dashed
## diagonal is the y = x reference line.
perf <- ROCR::performance(
  ROCR::prediction(fitted.y, observed.y),
  measure = "tpr",
  x.measure = "fpr"
)
ROCR::plot(perf)
abline(a = 0, b = 1, lty = 2)

## Binned residual plot of the direction model: predict() output on the
## x axis, resid() output on the y axis.
binnedplot(predict(directionmodel), resid(directionmodel))

## Distribution of the gap between FSPTX's log return and the composit's:
## density-scaled histogram with a density curve on top.
plotly::plotly_build(
  ggplot(regressiontable, aes(x = FSPTX - composit)) +
    geom_histogram(aes(y = ..density..), bins = 70, alpha = 0.5) +
    geom_density() +
    xlab("log return of FSPTX - log return of composit")
)